{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "source": [
    "import pandas as pd"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "source": [
    "# class_dict = {'OBJECTIVE': 0, 'METHODS': 1, 'RESULTS': 2, 'CONCLUSIONS': 3, 'BACKGROUND': 4}\n",
    "\n",
    "\n",
    "def get_data(file_from):\n",
    "    data = {\"label\":[],\"ques\":[]}\n",
    "    with open(file_from, 'r', encoding='utf-8') as f:\n",
    "        sentences, tags = [],[]\n",
    "        for line in f.readlines():\n",
    "            line = line.strip()\n",
    "            if not line:\n",
    "                if len(sentences) != 0:\n",
    "                    i = 0\n",
    "                    for sen in sentences:\n",
    "                        # fw.write(str(class_dict[tags[i]]) + '\\t' + sen + '\\n')\n",
    "                        if tags[i] and sen:\n",
    "                            data[\"label\"].append(tags[i])\n",
    "                            data[\"ques\"].append(sen)\n",
    "                        i += 1\n",
    "                sentences, tags = [], []\n",
    "            elif not line.startswith(\"###\"):\n",
    "                ls = line.split('\\t')\n",
    "                tag, sen = ls[0], ls[1]\n",
    "                sentences.append(sen)\n",
    "                tags.append(tag)\n",
    "    return data"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "source": [
    "# Flatten the test split into label/ques rows and save them as CSV.\n",
    "data = get_data(\n",
    "    \"/data/leo/Projects/HSLN-Joint-Sentence-Classification/data/pubmed_refind_28w/test_clean.txt\"\n",
    ")\n",
    "df = pd.DataFrame(data)\n",
    "df.to_csv(\"data/pubmed_refind_28w_test.csv\", index=False)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "source": [
    "# Flatten the dev split into label/ques rows and save them as CSV.\n",
    "data = get_data(\n",
    "    \"/data/leo/Projects/HSLN-Joint-Sentence-Classification/data/pubmed_refind_28w/dev_clean.txt\"\n",
    ")\n",
    "df = pd.DataFrame(data)\n",
    "df.to_csv(\"data/pubmed_refind_28w_dev.csv\", index=False)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "source": [
    "# Flatten the train split into label/ques rows and save them as CSV.\n",
    "data = get_data(\n",
    "    \"/data/leo/Projects/HSLN-Joint-Sentence-Classification/data/pubmed_refind_28w/train_clean.txt\"\n",
    ")\n",
    "df = pd.DataFrame(data)\n",
    "df.to_csv(\"data/pubmed_refind_28w_train.csv\", index=False)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "source": [
    "# Integer id for each label string.\n",
    "class_dict = {'OBJECTIVE': 0, 'METHODS': 1, 'RESULTS': 2, 'CONCLUSIONS': 3, 'BACKGROUND': 4}\n",
    "df = pd.read_csv(\"data/pubmed_refind_28w_dev.csv\")\n",
    "\n",
    "# map() silently turns labels missing from class_dict into NaN, so fail loudly instead.\n",
    "unknown_labels = set(df['label'].dropna()) - set(class_dict)\n",
    "if unknown_labels:\n",
    "    raise ValueError(f\"labels missing from class_dict: {sorted(map(str, unknown_labels))}\")\n",
    "df['label'] = df['label'].map(class_dict)\n",
    "\n",
    "# The previous call was df.to_csv(''), which names no file to write.\n",
    "# NOTE(review): the output name below is a placeholder chosen so the string-labelled\n",
    "# CSV is not overwritten; confirm the intended path.\n",
    "df.to_csv(\"data/pubmed_refind_28w_dev_label_id.csv\", index=False)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "source": [
    "# Display df; pandas abbreviates long frames to their first and last rows.\n",
    "df"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>ques</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4</td>\n",
       "      <td>The safety of psoralen plus ultraviolet A (PUV...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>A few multiple-center cooperative studies have...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4</td>\n",
       "      <td>In our institute, more than @ patients have be...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>We investigated the incidence of skin cancer a...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>This is a historical cohort study of two compa...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29479</th>\n",
       "      <td>2</td>\n",
       "      <td>The most commonly reported conditions in the s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29480</th>\n",
       "      <td>2</td>\n",
       "      <td>Tone and motor problems were the most commonly...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29481</th>\n",
       "      <td>3</td>\n",
       "      <td>The evidence for symptom management in Q@ cond...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29482</th>\n",
       "      <td>3</td>\n",
       "      <td>The evidence is dispersed in the literature an...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29483</th>\n",
       "      <td>3</td>\n",
       "      <td>More research needs to be done in these condit...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>29484 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       label                                               ques\n",
       "0          4  The safety of psoralen plus ultraviolet A (PUV...\n",
       "1          4  A few multiple-center cooperative studies have...\n",
       "2          4  In our institute, more than @ patients have be...\n",
       "3          0  We investigated the incidence of skin cancer a...\n",
       "4          1  This is a historical cohort study of two compa...\n",
       "...      ...                                                ...\n",
       "29479      2  The most commonly reported conditions in the s...\n",
       "29480      2  Tone and motor problems were the most commonly...\n",
       "29481      3  The evidence for symptom management in Q@ cond...\n",
       "29482      3  The evidence is dispersed in the literature an...\n",
       "29483      3  More research needs to be done in these condit...\n",
       "\n",
       "[29484 rows x 2 columns]"
      ]
     },
     "metadata": {},
     "execution_count": 5
    }
   ],
   "metadata": {}
  }
 ],
 "metadata": {
  "orig_nbformat": 4,
  "language_info": {
   "name": "python",
   "version": "3.6.12",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.6.12 64-bit ('py36': conda)"
  },
  "interpreter": {
   "hash": "2e2ff3a457722a20f87dbf10c05994872f65588779806bce29ecd514429d1c22"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}